setwd("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/RProject/R4VMap2_S1000Figs/")## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
##
## Attaching package: 'egg'
## The following object is masked from 'package:ggpubr':
##
## ggarrange
library(factoextra)## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22, Subspecies_by22_TreeValidated,Subspecies_by6_TreeValidated,Continent_forTree,Group,GroupByOriPublication_Abbre)
### import matrix
dataFile <- c("data/002_pca/004_matrix_Asub.txt.gz")
df <- read.delim(dataFile)
### 建立Taxa对应的属性变量数据框
dfTaxaInfo <- df %>% select(Taxa) %>% left_join(.,dftaxaDB,by="Taxa")
### pca analysis
c1 <- which(colnames(df) == 'Taxa')
df.pca <- PCA(df[,-c(c1)], graph = F)
### 提取变异解释度并合并数据
variance1 <- paste(round(df.pca$eig[1,3],2),"%",sep="") ##第一个成分
variance2 <- paste(round(df.pca$eig[2,3] - df.pca$eig[1,3],2),"%",sep="") ##第二个成分
dfpca <- as.data.frame(df.pca$ind$coord)
df <- cbind(dfTaxaInfo,dfpca)
factorA <- c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids", "Landrace","Cultivar","OtherHexaploids")
labelsA <- c("Wild emmer","Domesticated emmer","Free-threshing tetraploids","OtherTetraploids", "Landrace","Cultivar","OtherHexaploids")
colB <- c("#ffd702","#7f5701","#016699","#00f3ff", "#fc6e6e","#9900ff","#fe63c2")
df$Subspecies_by6_TreeValidated <- factor(df$Subspecies_by6_TreeValidated,levels=factorA,labels=labelsA)### start to plot
p <- df %>%
# filter(! Taxa %in% c("AB_183","AB_185","AB_039")) %>%
ggplot(aes(x=Dim.1,y=Dim.2,fill= Subspecies_by6_TreeValidated))+
# geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
geom_point(alpha=0.8,size=2,color="gray",shape=21)+
geom_hline(yintercept=0, linetype="dashed", color = "black")+
geom_vline(xintercept = 0, linetype="dashed", color = "black")+
# scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
# scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
# scale_color_manual(values = colB,name="Genetic group")+
scale_fill_manual(values = colB,name="Genetic group")+
# scale_shape_manual(values = shapeB)+
labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))
pggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)
library(plotly)##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
fig <- df %>%
plot_ly(x = ~ Dim.1, y = ~ Dim.2,color = ~ Subspecies_by6_TreeValidated,
text = ~paste(Taxa),
colors = colB)
fig## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
library(factoextra)
library(FactoMineR)
### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22, Subspecies_by22_TreeValidated,Subspecies_by6_TreeValidated,Continent_forTree,Group,GroupByOriPublication_Abbre)
### import matrix
dataFile <- c("data/002_pca/005_matrix_Bsub.txt.gz")
df <- read.delim(dataFile)
### 建立Taxa对应的属性变量数据框
dfTaxaInfo <- df %>% select(Taxa) %>% left_join(.,dftaxaDB,by="Taxa")
### pca analysis
c1 <- which(colnames(df) == 'Taxa')
df.pca <- PCA(df[,-c(c1)], graph = F)
### 提取变异解释度并合并数据
variance1 <- paste(round(df.pca$eig[1,3],2),"%",sep="") ##第一个成分
variance2 <- paste(round(df.pca$eig[2,3] - df.pca$eig[1,3],2),"%",sep="") ##第二个成分
dfpca <- as.data.frame(df.pca$ind$coord)
df <- cbind(dfTaxaInfo,dfpca)
factorA <- c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids", "Landrace","Cultivar","OtherHexaploids")
labelsA <- c("Wild emmer","Domesticated emmer","Free-threshing tetraploids","OtherTetraploids", "Landrace","Cultivar","OtherHexaploids")
colB <- c("#ffd702","#7f5701","#016699","#00f3ff", "#fc6e6e","#9900ff","#fe63c2")
df$Subspecies_by6_TreeValidated <- factor(df$Subspecies_by6_TreeValidated,levels=factorA,labels=labelsA)### start to plot
p <- df %>%
# filter(! Taxa %in% c("AB_183","AB_185","AB_039")) %>%
ggplot(aes(x=Dim.1,y=Dim.2,fill= Subspecies_by6_TreeValidated))+
# geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
geom_point(alpha=0.8,size=2,color="gray",shape=21)+
geom_hline(yintercept=0, linetype="dashed", color = "black")+
geom_vline(xintercept = 0, linetype="dashed", color = "black")+
# scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
# scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
# scale_color_manual(values = colB,name="Genetic group")+
scale_fill_manual(values = colB,name="Genetic group")+
# scale_shape_manual(values = shapeB)+
labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))
pggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)
library(plotly)
fig <- df %>%
plot_ly(x = ~ Dim.1, y = ~ Dim.2,color = ~ Subspecies_by6_TreeValidated,
text = ~paste(Taxa),
colors = colB)
fig## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
library(factoextra)
library(FactoMineR)
### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22, Subspecies_by22_TreeValidated,Subspecies_by6_TreeValidated,Continent_forTree,Group,GroupByOriPublication_Abbre)
### import matrix
dataFile <- c("data/002_pca/006_matrix_Dsub.txt.gz")
df <- read.delim(dataFile)
### 建立Taxa对应的属性变量数据框
dfTaxaInfo <- df %>% select(Taxa) %>% left_join(.,dftaxaDB,by="Taxa")
### pca analysis
c1 <- which(colnames(df) == 'Taxa')
df.pca <- PCA(df[,-c(c1)], graph = F)## Warning in PCA(df[, -c(c1)], graph = F): Missing values are imputed by the mean
## of the variable: you should use the imputePCA function of the missMDA package
### 提取变异解释度并合并数据
variance1 <- paste(round(df.pca$eig[1,3],2),"%",sep="") ##第一个成分
variance2 <- paste(round(df.pca$eig[2,3] - df.pca$eig[1,3],2),"%",sep="") ##第二个成分
dfpca <- as.data.frame(df.pca$ind$coord)
df <- cbind(dfTaxaInfo,dfpca)
factorA <- c("Strangulata","Landrace","Cultivar","OtherHexaploids")
labelsA <- c("Strangulata","Landrace","Cultivar","OtherHexaploids")
colB <- c("#87cef9","#fc6e6e","#9900ff","#fe63c2")
df$Subspecies_by6_TreeValidated <- factor(df$Subspecies_by6_TreeValidated,levels=factorA,labels=labelsA)### start to plot
p <- df %>%
# filter(! Taxa %in% c("AB_183","AB_185","AB_039")) %>%
ggplot(aes(x=Dim.1,y=Dim.2,fill= Subspecies_by6_TreeValidated))+
# geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
geom_point(alpha=0.8,size=2,color="gray",shape=21)+
geom_hline(yintercept=0, linetype="dashed", color = "black")+
geom_vline(xintercept = 0, linetype="dashed", color = "black")+
# scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
# scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
# scale_color_manual(values = colB,name="Genetic group")+
scale_fill_manual(values = colB,name="Genetic group")+
# scale_shape_manual(values = shapeB)+
labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))
pggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)
ggplotly(p)# library(plotly)
# fig <- df %>%
# plot_ly(x = ~ Dim.1, y = ~ Dim.2,color = ~ Subspecies_by6_TreeValidated,
# text = ~paste(Taxa),
# colors = colB)
#
# figlibrary(factoextra)
library(FactoMineR)
### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22, Subspecies_by22_TreeValidated,Subspecies_by6_TreeValidated,Continent_forTree,Group,GroupByOriPublication_Abbre)
### import matrix
dataFile <- c("data/002_pca/001_matrix_hexa.txt.gz")
df <- read.delim(dataFile)
### 建立Taxa对应的属性变量数据框
dfTaxaInfo <- df %>% select(Taxa) %>% left_join(.,dftaxaDB,by="Taxa")
### pca analysis
c1 <- which(colnames(df) == 'Taxa')
df.pca <- PCA(df[,-c(c1)], graph = F)
### 提取变异解释度并合并数据
variance1 <- paste(round(df.pca$eig[1,3],2),"%",sep="") ##第一个成分
variance2 <- paste(round(df.pca$eig[2,3] - df.pca$eig[1,3],2),"%",sep="") ##第二个成分
dfpca <- as.data.frame(df.pca$ind$coord)
df <- cbind(dfTaxaInfo,dfpca)
### 设置因子顺序及颜色
factorA <- c("Landrace","Cultivar","OtherHexaploids","compactum","sphaerococcum","tibeticum","vavilovii","petropavlovskyi","yunna-nense","macha","spelta")
labelsA <- c("Landrace","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt")
colB <- c("#fc6e6e","#9900ff","#fe63c2","gray","#E41A1C","#984EA3","#FF7F00","#FFFF33","#A65628","#377EB8","#4DAF4A")
df$Subspecies_by22 <- factor(df$Subspecies_by22,levels = factorA,labels = labelsA)### start to plot
p <- df %>%
# filter(Subspecies == "Spelt_CAU" |Subspecies == "spelta") %>%
# filter(Subspecies %in% c("Landrace_Yunnan","yunna-nense")) %>%
# filter(Taxa == "ABD_1142" | Taxa == "ABD_1143") %>%
# filter(Subspecies %in% c("petropavlovskyi","Landrace_Xinjiang")) %>%
# filter(Subspecies %in% c("Landrace_Tibet","tibeticum")) %>%
# filter(Subspecies == "Cultivar",Group %in% c("CAAS") ) %>%
ggplot(aes(x=Dim.1,y=Dim.2,fill= Subspecies_by22))+
# geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
geom_point(alpha=0.8,size=2,color="gray",shape=21)+
geom_hline(yintercept=0, linetype="dashed", color = "black")+
geom_vline(xintercept = 0, linetype="dashed", color = "black")+
# scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
# scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
# scale_color_manual(values = colB,name="Genetic group")+
scale_fill_manual(values = colB,name="Hexaploid by subspecies")+
# scale_shape_manual(values = shapeB)+
labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))
pggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)
ggsave(p,filename = "~/Documents/test.pdf",height = 5,width = 5)
library(plotly)
fig <- df %>%
# filter(Subspecies %in% c("Landrace_Yunnan","yunna-nense")) %>%
# filter(Subspecies == "Spelt_CAU" |Subspecies == "spelta") %>%
# filter(Subspecies %in% c("petropavlovskyi","Landrace_Xinjiang")) %>%
# filter(Subspecies == "Landrace",Group %in% c("JiaoLab","LuLab","CAAS") ) %>%
plot_ly(x = ~ Dim.1, y = ~ Dim.2,color = ~ Subspecies_by22,
text = ~paste(Taxa),
colors = colB)
fig## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
library(factoextra)
library(FactoMineR)
### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22, Subspecies_by22_TreeValidated,Subspecies_by6_TreeValidated,Continent_forTree,Group,GroupByOriPublication_Abbre)
### import matrix
dataFile <- c("data/002_pca/002_matrix_tetra.txt.gz")
df <- read.delim(dataFile)
### 建立Taxa对应的属性变量数据框
dfTaxaInfo <- df %>% select(Taxa) %>% left_join(.,dftaxaDB,by="Taxa")
### pca analysis
c1 <- which(colnames(df) == 'Taxa')
df.pca <- PCA(df[,-c(c1)], graph = F)
### 提取变异解释度并合并数据
variance1 <- paste(round(df.pca$eig[1,3],2),"%",sep="") ##第一个成分
variance2 <- paste(round(df.pca$eig[2,3] - df.pca$eig[1,3],2),"%",sep="") ##第二个成分
dfpca <- as.data.frame(df.pca$ind$coord)
df <- cbind(dfTaxaInfo,dfpca)
factorA <- c("dicoccoides","dicoccum","durum","turanicum","polonicum","turgidum","karamyschevii","ispahanicum","carthlicum")
labelsA <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat ","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat")
colB <- c("#ffd702","#7f5701","#016699","#66A61E","#E7298A","#666666","#7570B3","#D95F02","cyan") ### 色系来自 Dark2
df$Subspecies_by22 <- factor(df$Subspecies_by22,levels = factorA,labels = labelsA)p <- df %>%
filter(! Taxa %in% c("AB_183","AB_185","AB_039")) %>%
# filter(Subspecies == "dicoccoides") %>%
ggplot(aes(x=Dim.1,y=Dim.2,fill=Subspecies_by22))+
# geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
geom_point(alpha=0.8,size=2,color="gray",shape=21)+
geom_hline(yintercept=0, linetype="dashed", color = "black")+
geom_vline(xintercept = 0, linetype="dashed", color = "black")+
# scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
# scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
# scale_color_manual(values = colB,name="Genetic group")+
scale_fill_manual(values = colB,name="Tetraploid by subspecies")+
# scale_shape_manual(values = shapeB)+
labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))
pggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 3)
library(plotly)
fig <- df %>%
filter(! Taxa %in% c("AB_183","AB_185","AB_039")) %>%
plot_ly(x = ~ Dim.1, y = ~ Dim.2,color = ~ Subspecies_by22,
text = ~paste(Taxa),
colors = colB)
fig## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
library(factoextra)
library(FactoMineR)
### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22, Subspecies_by22_TreeValidated,Subspecies_by6_TreeValidated,Continent_forTree,Group,GroupByOriPublication_Abbre)
### import matrix
dataFile <- c("data/002_pca/001_matrix_hexa.txt.gz")
df <- read.delim(dataFile)
### 建立Taxa对应的属性变量数据框
dfTaxaInfo <- df %>% select(Taxa) %>% left_join(.,dftaxaDB,by="Taxa")
### pca analysis
c1 <- which(colnames(df) == 'Taxa')
df.pca <- PCA(df[,-c(c1)], graph = F)
### 提取变异解释度并合并数据
variance1 <- paste(round(df.pca$eig[1,3],2),"%",sep="") ##第一个成分
variance2 <- paste(round(df.pca$eig[2,3] - df.pca$eig[1,3],2),"%",sep="") ##第二个成分
dfpca <- as.data.frame(df.pca$ind$coord)
df <- cbind(dfTaxaInfo,dfpca)colB <- c('#7B241C','#dbb3ff','#A9A9A9','#fc6e6e','#FF9900','#7dbde8','#82C782')
shapeB <- c(16,8,18,6,15,7,17)
### start to plot
p <- df %>%
filter(!Continent_forTree=="") %>%
ggplot(aes(x=Dim.1,y=Dim.2))+
# geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
geom_point(aes(col=Continent_forTree,shape=Continent_forTree),alpha=0.8,size=1.5)+
geom_hline(yintercept=0, linetype="dashed", color = "black")+
geom_vline(xintercept = 0, linetype="dashed", color = "black")+
scale_color_manual(values = colB)+
scale_shape_manual(values = shapeB)+
labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))
pggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)
library(plotly)
fig <- df %>%
# filter(Subspecies %in% c("Landrace_Yunnan","yunna-nense")) %>%
# filter(Subspecies == "Spelt_CAU" |Subspecies == "spelta") %>%
# filter(Subspecies %in% c("petropavlovskyi","Landrace_Xinjiang")) %>%
# filter(Subspecies == "Landrace",Group %in% c("JiaoLab","LuLab","CAAS") ) %>%
plot_ly(x = ~ Dim.1, y = ~ Dim.2,color = ~ Subspecies_by22,
text = ~paste(Taxa),
colors = colB)
fig## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
library(factoextra)
library(FactoMineR)
### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22, Subspecies_by22_TreeValidated,Subspecies_by6_TreeValidated,Continent_forTree,Group,GroupByOriPublication_Abbre)
### import matrix
dataFile <- c("data/002_pca/002_matrix_tetra.txt.gz")
df <- read.delim(dataFile)
### 建立Taxa对应的属性变量数据框
dfTaxaInfo <- df %>% select(Taxa) %>% left_join(.,dftaxaDB,by="Taxa")
### pca analysis
c1 <- which(colnames(df) == 'Taxa')
df.pca <- PCA(df[,-c(c1)], graph = F)
### 提取变异解释度并合并数据
variance1 <- paste(round(df.pca$eig[1,3],2),"%",sep="") ##第一个成分
variance2 <- paste(round(df.pca$eig[2,3] - df.pca$eig[1,3],2),"%",sep="") ##第二个成分
dfpca <- as.data.frame(df.pca$ind$coord)
df <- cbind(dfTaxaInfo,dfpca)colB <- c(c('#7B241C','#dbb3ff','#A9A9A9','#fc6e6e','#FF9900','#82C782'))
shapeB <- c(16,8,18,6,15,17)
p <- df %>%
filter(! Taxa %in% c("AB_183","AB_185","AB_039")) %>%
ggplot(aes(x=Dim.1,y=Dim.2,fill=Subspecies_by22))+
geom_point(aes(col=Continent_forTree,shape=Continent_forTree),alpha=0.8,size=1.5)+
geom_hline(yintercept=0, linetype="dashed", color = "black")+
geom_vline(xintercept = 0, linetype="dashed", color = "black")+
scale_color_manual(values = colB)+
scale_shape_manual(values = shapeB)+ labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))
p## Warning: Removed 6 rows containing missing values (geom_point).
ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 3)## Warning: Removed 6 rows containing missing values (geom_point).
library(plotly)
fig <- df %>%
plot_ly(x = ~ Dim.1, y = ~ Dim.2,color = ~ Subspecies_by22,
text = ~paste(Taxa),
colors = colB)
fig## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
library(factoextra)
library(FactoMineR)
### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22, Subspecies_by22_TreeValidated,Subspecies_by6_TreeValidated,Continent_forTree,Group,GroupByOriPublication_Abbre)
### import matrix
dataFile <- c("data/002_pca/003_matrix_diploid.txt.gz")
df <- read.delim(dataFile)
### 建立Taxa对应的属性变量数据框
dfTaxaInfo <- df %>% select(Taxa) %>% left_join(.,dftaxaDB,by="Taxa")
### pca analysis
c1 <- which(colnames(df) == 'Taxa')
df.pca <- PCA(df[,-c(c1)], graph = F)
### 提取变异解释度并合并数据
variance1 <- paste(round(df.pca$eig[1,3],2),"%",sep="") ##第一个成分
variance2 <- paste(round(df.pca$eig[2,3] - df.pca$eig[1,3],2),"%",sep="") ##第二个成分
dfpca <- as.data.frame(df.pca$ind$coord)
df <- cbind(dfTaxaInfo,dfpca)
### strangulata 只有来自 America/ Europe/ Western Asia
colB <- c('#dbb3ff','#FF9900','#82C782')
shapeB <- c(8,15,17)### start to plot
p <- df %>%
ggplot(aes(x=Dim.1,y=Dim.2,fill= Subspecies_by22))+
# geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
# geom_point(alpha=0.8,size=2,color="gray",shape=21)+
geom_point(aes(col=Continent_forTree,shape=Continent_forTree),alpha=0.8,size=2)+
geom_hline(yintercept=0, linetype="dashed", color = "black")+
geom_vline(xintercept = 0, linetype="dashed", color = "black")+
scale_color_manual(values = colB,name="Genetic group")+
scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
scale_fill_manual(values = colB,name="Hexaploid by subspecies")+
scale_shape_manual(values = shapeB)+
labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12)) ## Scale for 'shape' is already present. Adding another scale for 'shape', which
## will replace the existing scale.
p## Warning: Removed 2 rows containing missing values (geom_point).
ggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)## Warning: Removed 2 rows containing missing values (geom_point).
# library(plotly)
# fig <- df %>%
# # filter(Subspecies %in% c("Landrace_Yunnan","yunna-nense")) %>%
# # filter(Subspecies == "Spelt_CAU" |Subspecies == "spelta") %>%
# # filter(Subspecies %in% c("petropavlovskyi","Landrace_Xinjiang")) %>%
# # filter(Subspecies == "Landrace",Group %in% c("JiaoLab","LuLab","CAAS") ) %>%
# plot_ly(x = ~ Dim.1, y = ~ Dim.2,color = ~ Subspecies_by22,
# text = ~paste(Taxa),
# colors = colB)
#
# fig### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>%
select(Taxa=VcfID,GenomeType, Subspecies_by22_TreeValidated,Subspecies_by6_TreeValidated,Continent_forTree,Group)
dataFile <- c("data/002_pca/004_matrix_Asub.txt.gz")
dfmatrixA <- read.delim(dataFile) %>% select(Taxa,ABD_1142) %>% mutate(Sub = "A")
dataFile <- c("data/002_pca/005_matrix_Bsub.txt.gz")
dfmatrixB <- read.delim(dataFile) %>% select(Taxa,ABD_1142) %>% mutate(Sub = "B")
dataFile <- c("data/002_pca/006_matrix_Dsub.txt.gz")
dfmatrixD <- read.delim(dataFile) %>% select(Taxa,ABD_1142) %>% mutate(Sub = "D")
dfori <- rbind(dfmatrixA,dfmatrixB,dfmatrixD) %>% rename(IBSdistance2CS=ABD_1142)
df <- dfori %>% group_by(Taxa) %>%
summarise(aveIBS = mean(IBSdistance2CS,na.rm = T)) %>%
left_join(.,dftaxaDB,by="Taxa")
### 包括Other类型
factorA <- c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids", "Landrace","Cultivar","OtherHexaploids","Strangulata")
labelsA <- c("Wild emmer","Domesticated emmer","Free-threshing tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata")
colC <- c("#ffd702","#7f5701","#016699","#00f3ff", "#fc6e6e","#9900ff","#fe63c2","#87cef9")
df$Subspecies_by6_TreeValidated <- factor(df$Subspecies_by6_TreeValidated,levels = factorA,labels = labelsA)
# write.table(dfV1,file="data/002_pca/ibs.txt",quote=F,col.name=T,row.names=F,sep = "\t")colB <- c("#ffd702","#fc6e6e","#87cef9")
p <- ggplot(df,aes(x=GenomeType, y=aveIBS,fill= GenomeType))+
geom_boxplot(position = position_dodge(0.8),outlier.colour = "white",alpha=0.1)+
stat_boxplot(geom = "errorbar",width=0.12,position = position_dodge(0.5))+
geom_point(position = position_jitterdodge(0.8),alpha = 0.6,aes(color=Subspecies_by6_TreeValidated),size=1)+
labs(y="IBS distance to CS",x="")+
scale_fill_manual(values = colB)+
scale_color_manual(values = colC)+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.7)
,legend.position = 'none'
,text = element_text(size = 14))
pggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>%
select(Taxa=VcfID,GenomeType, Subspecies_by22_TreeValidated,Subspecies_by6_TreeValidated,Continent_forTree,Group)
dataFile <- c("data/002_pca/004_matrix_Asub.txt.gz")
dfmatrixA <- read.delim(dataFile) %>% select(Taxa,ABD_1142) %>% mutate(Sub = "A")
dataFile <- c("data/002_pca/005_matrix_Bsub.txt.gz")
dfmatrixB <- read.delim(dataFile) %>% select(Taxa,ABD_1142) %>% mutate(Sub = "B")
dataFile <- c("data/002_pca/006_matrix_Dsub.txt.gz")
dfmatrixD <- read.delim(dataFile) %>% select(Taxa,ABD_1142) %>% mutate(Sub = "D")
df <- rbind(dfmatrixA,dfmatrixB,dfmatrixD) %>%
rename(IBSdistance2CS=ABD_1142) %>%
left_join(.,dftaxaDB,by="Taxa")### 六倍体的亚群
factorHexa <- c("Landrace","Cultivar","OtherHexaploids","compactum","sphaerococcum","tibeticum","vavilovii","petropavlovskyi","yunna-nense","macha","spelta")
labelsHexa <- c("Landrace","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt")
# colotherHexaploid <- brewer.pal(12,"Set1")[1:7]
# colB <- c("#fc6e6e","#9900ff","#fe63c2","gray",colotherHexaploid)
colBHexa <- c("#fc6e6e","#9900ff","#fe63c2","gray","#E41A1C","#984EA3","#FF7F00","#FFFF33","#A65628","#377EB8","#4DAF4A")
### 四倍体的亚群
factorTetra <- c("dicoccoides","dicoccum","durum","turanicum","polonicum","turgidum","karamyschevii","ispahanicum","carthlicum")
labelsTetra <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat ","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat")
colBTetra <- c("#ffd702","#7f5701","#016699","#66A61E","#E7298A","#666666","#7570B3","#D95F02","cyan") ### 色系来自 Dark2
df$Subspecies_by22_TreeValidated <- factor(df$Subspecies_by22_TreeValidated,levels = c(factorTetra, factorHexa,"strangulata"),labels = c(labelsTetra,labelsHexa,"Strangulata"))
colB <- c(colBTetra,colBHexa,"#87cef9")### 计算每个亚群的个数,并添加到taxa数据库后面
dftaxaCount <- df %>% group_by(Sub,Subspecies_by22_TreeValidated) %>%
count() %>% ungroup() %>% distinct(.,Subspecies_by22_TreeValidated,.keep_all = T) %>%
select(-Sub) %>% mutate(Subspecies_by22_TreeValidated_AddCount = str_c(Subspecies_by22_TreeValidated," (", n, ")",sep = "")) %>%
select(-n)p <- ggplot(df,aes(x=Subspecies_by22_TreeValidated,y= IBSdistance2CS))+
# stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
# geom_violin(position = position_dodge(0.8),alpha=0.8) +
# geom_boxplot(position = position_dodge(1),outlier.color = 'white',alpha=0.8,width=0.5)+ #,width=0.2
# geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
ylab("IBS distance to CS")+
xlab("")+
### 加阴影
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# 最上面的条纹
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=0.25,ymax=Inf),fill="#ffd702",alpha=0.5)+
geom_rect(aes(xmin=2-0.5,xmax=3+0.5,ymin=0.25,ymax=Inf),fill="#7f5701",alpha=0.5)+
geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=0.25,ymax=Inf),fill="#016699",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=0.25,ymax=Inf),fill="#00f3ff",alpha=0.5)+
geom_rect(aes(xmin=10-0.5,xmax=10+0.5,ymin=0.25,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=0.25,ymax=Inf),fill="#9900ff",alpha=0.5)+
geom_rect(aes(xmin=12-0.5,xmax=20+0.5,ymin=0.25,ymax=Inf),fill="#fe63c2",alpha=0.5)+
geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=0.25,ymax=Inf),fill="#87cef9",alpha=0.5)+
# stat_summary(fun=mean, geom="point", color="red",position = position_dodge(1),size=0.1)+
stat_summary(aes(color=Sub),fun.data=mean_sdl,position = position_dodge(0.5),geom="pointrange",size=0.3)+
geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
geom_vline(xintercept = 20.5,linetype="dashed",color="black")+
scale_color_manual(values =c("#fd8582","#967bce","#4bcdc6"),name='')+
scale_x_discrete(labels = dftaxaCount$Subspecies_by22_TreeValidated_AddCount)+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
theme(plot.margin=unit(rep(1.3,4),'lines'))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.4),
text = element_text(size = 12),legend.position = 'none')
p## Warning: Removed 12 rows containing non-finite values (stat_summary).
# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 3.88,width = 7)## Warning: Removed 12 rows containing non-finite values (stat_summary).
### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>%
select(Taxa=VcfID,GenomeType, GroupbyContinent) %>%
filter(str_detect(GroupbyContinent,"LR") | str_detect(GroupbyContinent,"CL"))
dataFile <- c("data/002_pca/004_matrix_Asub.txt.gz")
dfmatrixA <- read.delim(dataFile) %>% select(Taxa,ABD_1142) %>% mutate(Sub = "A")
dataFile <- c("data/002_pca/005_matrix_Bsub.txt.gz")
dfmatrixB <- read.delim(dataFile) %>% select(Taxa,ABD_1142) %>% mutate(Sub = "B")
dataFile <- c("data/002_pca/006_matrix_Dsub.txt.gz")
dfmatrixD <- read.delim(dataFile) %>% select(Taxa,ABD_1142) %>% mutate(Sub = "D")
df <- rbind(dfmatrixA,dfmatrixB,dfmatrixD) %>%
rename(IBSdistance2CS=ABD_1142) %>%
right_join(.,dftaxaDB,by="Taxa")### 统计每个亚基因组内,以亚洲为单位的均值,按照B亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>% group_by(Sub,GroupbyContinent) %>% summarise(mean = mean(IBSdistance2CS)) %>% arrange(.,Sub,-mean) %>% ungroup() %>% filter(Sub=="B") %>% select(GroupbyContinent)## `summarise()` has grouped output by 'Sub'. You can override using the `.groups` argument.
### 因子排序
df$GroupbyContinent <- factor(df$GroupbyContinent,levels = dfV1$GroupbyContinent)
### 计算每个亚群的个数
dftaxaCount <- df %>% group_by(Sub,GroupbyContinent) %>%
count() %>% ungroup() %>% distinct(.,GroupbyContinent,.keep_all = T) %>%
select(-Sub) %>% mutate(GroupbyContinent_AddCount = str_c(GroupbyContinent," (", n, ")",sep = "")) %>%
select(-n)### 阴影,discrete x
### 注意,如果想要加阴影,必须有scale_x_discrete 的自定义才可以
p <- ggplot(df,aes(x=GroupbyContinent,y=IBSdistance2CS))+
ylab("IBS distance to CS")+xlab("")+
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# stat_summary(fun=mean, geom="point", color="red",position = position_dodge(1),size=0.1)+
stat_summary(aes(color=Sub),fun.data=mean_sdl,position = position_dodge(0.5),geom="pointrange",size=0.3)+
scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
scale_x_discrete(labels = dftaxaCount$GroupbyContinent_AddCount)+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
# theme(plot.margin=unit(rep(0.3,4),'lines'))+
theme(plot.margin=unit(rep(1.2,4),'lines'))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.4),
text = element_text(size = 12)
,legend.position = 'none'
,legend.key = element_blank()
)
pggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 3.43,width = 4)### ********************** Section1 *********************************###
### taxa 属性数据框
factorA <- c("dicoccoides","dicoccum","durum","turanicum","polonicum","turgidum","karamyschevii","ispahanicum","carthlicum","Landrace","Cultivar","OtherHexaploids","compactum","sphaerococcum","tibeticum","vavilovii","petropavlovskyi","yunna-nense","macha","spelta","strangulata")
labelsA <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat ","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","Landrace","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")
dfhash <- tibble(Subspecies_by22_TreeValidated=factorA,Subspecies22=labelsA)
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22_TreeValidated,GroupbyContinent) %>%
left_join(.,dfhash,by="Subspecies_by22_TreeValidated") %>%
mutate(GroupforExpansionLoad = if_else(Subspecies22=="Landrace",GroupbyContinent,Subspecies22)) %>%
select(Taxa,GenomeType,GroupforExpansionLoad)## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dataFile <- c("data/002_pca/004_matrix_Asub.txt.gz")
dfmatrixA <- read.delim(dataFile) %>% select(Taxa,ABD_1142) %>% mutate(Sub = "A")
dataFile <- c("data/002_pca/005_matrix_Bsub.txt.gz")
dfmatrixB <- read.delim(dataFile) %>% select(Taxa,ABD_1142) %>% mutate(Sub = "B")
dataFile <- c("data/002_pca/006_matrix_Dsub.txt.gz")
dfmatrixD <- read.delim(dataFile) %>% select(Taxa,ABD_1142) %>% mutate(Sub = "D")
df <- rbind(dfmatrixA,dfmatrixB,dfmatrixD) %>%
rename(IBSdistance2CS=ABD_1142) %>%
left_join(.,dftaxaDB,by="Taxa") %>%
filter(!is.na(GroupforExpansionLoad))factorAo <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat ","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_EA","LR_CSA","LR_WA","LR_AF","LR_EU","LR_AM","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = factorAo)### 计算每个亚群的个数,并添加到taxa数据库后面
dftaxaCount <- df %>%
group_by(Sub,GroupforExpansionLoad) %>%
count() %>%
ungroup() %>%
distinct(.,GroupforExpansionLoad,.keep_all = T) %>%
select(-Sub) %>%
mutate(GroupforExpansionLoad_AddCount =
str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>%
select(-n)p <- ggplot(df,aes(x=GroupforExpansionLoad,y= IBSdistance2CS))+
# stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
# geom_violin(position = position_dodge(0.8),alpha=0.8) +
# geom_boxplot(position = position_dodge(1),outlier.color = 'white',alpha=0.8,width=0.5)+ #,width=0.2
# geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
ylab("IBS distance to CS")+
xlab("")+
### 加阴影
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# 最上面的条纹
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=0.25,ymax=Inf),fill="#ffd702",alpha=0.5)+
geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=0.25,ymax=Inf),fill="#7f5701",alpha=0.5)+
geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=0.25,ymax=Inf),fill="#016699",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=0.25,ymax=Inf),fill="#00f3ff",alpha=0.5)+
geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=0.25,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=0.25,ymax=Inf),fill="#9900ff",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=0.25,ymax=Inf),fill="#fe63c2",alpha=0.5)+
geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=0.25,ymax=Inf),fill="#87cef9",alpha=0.5)+
# stat_summary(fun=mean, geom="point", color="red",position = position_dodge(1),size=0.1)+
stat_summary(aes(color=Sub),fun.data=mean_sdl,position = position_dodge(0.5),geom="pointrange",size=0.3)+
geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
scale_color_manual(values =c("#fd8582","#967bce","#4bcdc6"),name='')+
scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
theme(plot.margin=unit(rep(1.3,4),'lines'))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.4),
text = element_text(size = 10),legend.position = 'none')
p## Warning: Removed 12 rows containing non-finite values (stat_summary).
# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 3.5,width = 7)## Warning: Removed 12 rows containing non-finite values (stat_summary).
library(brew)
subA <- c("AABB","AABBDD","DD")
colB <- c('#ffd702','#fc6e6e',"#87cef9")
### 要去除的几个 spelt 个体
spelt <- c("ABD_1130","ABD_1131","ABD_1132","ABD_1133","ABD_1134","ABD_1135","ABD_1136","ABD_1137","ABD_1138","ABD_1139","ABD_1140","ABD_1141")
df1 <- read.delim("data/003_VCF_QC/AABBDD_taxa_QC.txt.gz")
df2 <- read.delim("data/003_VCF_QC/AABB_taxa_QC.txt.gz")
df3 <- read.delim("data/003_VCF_QC/DD_taxa_QC.txt.gz")
df <- rbind(df1,df2,df3) %>% mutate(GenomeType = factor(GenomeType,levels = subA))
p <- df %>%
filter(! Taxa %in% spelt) %>%
ggplot(aes(x=GenomeType,y=HeterozygousProportion,fill = GenomeType)) +
geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(color=GenomeType),size=0.2)+
ylab("Heterozygous genotype\nproportion by taxon") + xlab("")+
stat_summary(fun=mean, geom="point", color="blue",position = position_dodge(0.8),size=1)+
scale_fill_manual(values = colB,name='')+
scale_color_manual(values = colB,name='')+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.7)
,legend.position = 'none'
,text = element_text(size = 8))
# theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))
p# ggsave(p,filename = "~/Documents/test.pdf",height = 2.57,width = 3.43)
ggsave(p,filename = "~/Documents/test.pdf",height = 2.4,width = 1.8)
mean(df2$HeterozygousProportion)## [1] 0.01469433
subA <- c("AABB","AABBDD","DD")
colB <- c('#ffd702','#fc6e6e',"#87cef9")
### 要去除的几个 spelt 个体
spelt <- c("ABD_1130","ABD_1131","ABD_1132","ABD_1133","ABD_1134","ABD_1135","ABD_1136","ABD_1137","ABD_1138","ABD_1139","ABD_1140","ABD_1141")
df1 <- read.delim("data/003_VCF_QC/AABBDD_taxa_QC.txt.gz")
df2 <- read.delim("data/003_VCF_QC/AABB_taxa_QC.txt.gz")
df3 <- read.delim("data/003_VCF_QC/DD_taxa_QC.txt.gz")
df <- rbind(df1,df2,df3) %>% mutate(GenomeType = factor(GenomeType,levels = subA))
p <- df %>%
filter(! Taxa %in% spelt) %>%
ggplot(aes(x=MissRate,fill=GenomeType)) +
geom_histogram(aes(y=..ndensity..),alpha=1,position="dodge",bins = 30)+
# geom_density()+
ylab("Density") + xlab("Missing rate by taxon")+
scale_fill_manual(values = colB,name='')+
scale_color_manual(values = colB,name='')+
facet_grid(.~GenomeType)+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.4, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))+
theme(strip.background = element_rect(size = 0.4))+
# theme(strip.text = element_blank())+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))
pggsave(p,filename = "~/Documents/test.pdf",height = 2.57,width = 3.43)subA <- c("AABB","AABBDD","DD")
colB <- c('#ffd702','#fc6e6e',"#87cef9")
df1 <- read.delim("data/003_VCF_QC/AABBDD_site_QC.txt.gz")
df2 <- read.delim("data/003_VCF_QC/AABB_site_QC.txt.gz")
df3 <- read.delim("data/003_VCF_QC/DD_site_QC.txt.gz")
df <- rbind(df1,df2,df3) %>% mutate(GenomeType = factor(GenomeType,levels = subA))
p <- df %>%
ggplot(aes(x=MissingRate,fill=GenomeType)) +
geom_histogram(aes(y=..ndensity..),alpha=1,position="dodge",bins = 50)+
# geom_density()+
labs(x="Missing rate by site",y="Density")+
scale_fill_manual(values = colB,name='')+
scale_color_manual(values = colB,name='')+
coord_cartesian(xlim = c(0,0.5))+
facet_grid(.~GenomeType)+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.4, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))+
theme(strip.background = element_rect(size = 0.4))+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))
pggsave(p,filename = "~/Documents/test.pdf",height = 2.57,width = 3.43)subA <- c("AABB","AABBDD","DD")
colB <- c('#ffd702','#fc6e6e',"#87cef9")
df1 <- read.delim("data/003_VCF_QC/AABBDD_site_QC.txt.gz")
df2 <- read.delim("data/003_VCF_QC/AABB_site_QC.txt.gz")
df3 <- read.delim("data/003_VCF_QC/DD_site_QC.txt.gz")
df <- rbind(df1,df2,df3) %>% mutate(GenomeType = factor(GenomeType,levels = subA))
p <- df %>%
ggplot(aes(x=HeterozygousProportion,fill=GenomeType)) +
geom_histogram(aes(y=..ndensity..),alpha=1,position="dodge",bins = 50)+
# geom_density()+
labs(x="Heterozygous genotype proportion by site",y="Density")+
scale_fill_manual(values = colB,name='')+
scale_color_manual(values = colB,name='')+
coord_cartesian(xlim = c(0,0.5))+
facet_grid(.~GenomeType)+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.4, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))+
theme(strip.background = element_rect(size = 0.4))+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))
pggsave(p,filename = "~/Documents/test.pdf",height = 2.57,width = 3.43)subA <- c("AABB","AABBDD","DD")
colB <- c('#ffd702','#fc6e6e',"#87cef9")
df1 <- read.delim("data/003_VCF_QC/AABBDD_site_QC.txt.gz")
df2 <- read.delim("data/003_VCF_QC/AABB_site_QC.txt.gz")
df3 <- read.delim("data/003_VCF_QC/DD_site_QC.txt.gz")
df <- rbind(df1,df2,df3) %>% mutate(GenomeType = factor(GenomeType,levels = subA))
p <- df %>%
ggplot(aes(x=Maf,fill=GenomeType)) +
geom_histogram(aes(y=..ndensity..),alpha=1,position="dodge",bins = 50)+
# geom_density()+
labs(x="Missing rate by site",y="Density")+
scale_fill_manual(values = colB,name='')+
scale_color_manual(values = colB,name='')+
# coord_cartesian(xlim = c(0,0.5))+
facet_grid(.~GenomeType)+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.4, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))+
theme(strip.background = element_rect(size = 0.4))+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))
pggsave(p,filename = "~/Documents/test.pdf",height = 2.57,width = 3.43)library(cowplot) ##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggpubr':
##
## get_legend
library(ggExtra)
library(forcats)
dataFile <- c("data/003_VCF_QC/siteDepth/byGenomeType/tetra_depth.txt.gz")
df1 <- read.delim(dataFile) %>% mutate(GenomeType = "AABB")
dataFile <- c("data/003_VCF_QC/siteDepth/byGenomeType/hexa_depth.txt.gz")
df2 <- read.delim(dataFile)%>% mutate(GenomeType = "AABBDD")
dataFile <- c("data/003_VCF_QC/siteDepth/byGenomeType/diploid_depth.txt.gz")
df3 <- read.delim(dataFile)%>% mutate(GenomeType = "DD")
df <- rbind(df1,df2,df3) %>%
group_by(GenomeType) %>%
slice_sample(.,n = 3000) %>%
ungroup()
colB <- c('#ffd702','#fc6e6e',"#87cef9")
df$GenomeType <- factor(df$GenomeType,levels = c("AABB","AABBDD","DD"))
p <- ggplot(df,aes(x=AverageDepth,y=SD,col=GenomeType))+
geom_point(alpha=0.5,shape=1,size=0.8)+
# scale_x_continuous(limits = c(2,14),breaks = seq(0,16,4))+
# scale_y_continuous(limits = c(2,10),breaks = seq(0,12,2))+
xlab("Mean of depth")+ylab("SD")+
scale_color_manual(values = colB)+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.7)
,legend.position = 'none'
,text = element_text(size = 12))
# theme(panel.background = element_rect(size = 0.4, colour = 'black',fill = 'white'),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),text = element_text(size = 12),legend.position = 'none')
p
p2 <- ggMarginal(p, type="density",groupColour = TRUE, groupFill = TRUE)
p2# ggsave(p2,filename = "~/Documents/test.pdf",height = 3.2,width = 3.2)
ggsave(p2,filename = "~/Documents/test.pdf",height = 4,width = 4)library(cowplot)
library(ggExtra)
library(forcats)
dataFile <- c("data/003_VCF_QC/siteDepth/bySub/chrA_vmap2.1_500k_depth.txt.gz")
df1 <- read.delim(dataFile) %>% mutate(GenomeType = "A")
dataFile <- c("data/003_VCF_QC/siteDepth/bySub/chrA_vmap2.1_500k_depth.txt.gz")
df2 <- read.delim(dataFile)%>% mutate(GenomeType = "B")
dataFile <- c("data/003_VCF_QC/siteDepth/bySub/chrA_vmap2.1_500k_depth.txt.gz")
df3 <- read.delim(dataFile)%>% mutate(GenomeType = "D")
df <- rbind(df1,df2,df3) %>%
group_by(GenomeType) %>%
slice_sample(.,n = 3000) %>%
ungroup()
#Lines
# p <- ggplot(df, aes(x=AverageDepth, y=SD)) +
# geom_hex(bins = 30, size = 0.5, color = "black") +
# scale_fill_viridis_c(option = "C")+
# scale_y_continuous(limits=c(3,8),breaks = seq(0,8,2))+
# scale_x_continuous(limits=c(6,12),breaks = seq(0,12,2))+
# labs(x="Average depth",y="SD")+
# theme(
# panel.background = element_blank(),
# panel.border = element_rect(colour = "black", fill=NA, size=0.4),
# text = element_text(size = 12))
# p
# ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)
### 为了画进流程图
p <- ggplot(df, aes(x=AverageDepth, y=SD)) +
geom_hex(bins = 30, size = 0.01, color = "black") +
scale_fill_viridis_c(option = "C")+
scale_y_continuous(limits=c(3,8),breaks = seq(0,8,2))+
scale_x_continuous(limits=c(6,12),breaks = seq(0,12,2))+
labs(x="Average depth",y="SD")+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.2),
axis.ticks = element_line(size=0.2),
legend.position = "none",
text = element_text(size = 6.5))
p## Warning: Removed 4 rows containing non-finite values (stat_binhex).
# ggsave(p,filename = "~/Documents/test.pdf",height = 1.2,width = 2)
ggsave(p,filename = "~/Documents/test.pdf",height = 1.238,width = 1.315)## Warning: Removed 4 rows containing non-finite values (stat_binhex).
library(cowplot)
library(ggExtra)
library(forcats)
dataFile <- c("data/003_VCF_QC/taxaDepth/aabb.summary.txt")
df1 <- read.delim(dataFile)
dataFile <- c("data/003_VCF_QC/taxaDepth/aabbdd.summary.txt")
df2 <- read.delim(dataFile)
dataFile <- c("data/003_VCF_QC/taxaDepth/dd.summary.txt")
df3 <- read.delim(dataFile)
df <- rbind(df1,df2,df3) %>% select(-ID)
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>% select(Taxa = VcfID,GenomeType)
### 添加列信息
df1 <- df %>% left_join(.,dftaxaDB,by= "Taxa")
colB <- c('#ffd702','#fc6e6e',"#87cef9")
p <- df1 %>%
ggplot(aes(x=MeanDepth,fill =GenomeType))+
geom_histogram( alpha=0.95,bins = 50,color="white",size=0.3)+
scale_fill_manual(values = colB)+
scale_x_continuous(limits = c(0,28))+
theme(
legend.position = c(0.7,0.7),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.4),
text = element_text(size = 12))
p## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
### for data pipeline 小图
p <- df1 %>%
ggplot(aes(x=MeanDepth,fill =GenomeType))+
geom_histogram( alpha=0.95,bins = 30,color="white",size=0.01)+
scale_fill_manual(values = colB)+
scale_x_continuous(limits = c(0,25))+
theme(
# legend.position = c(0.7,0.7),
panel.background = element_blank(),
axis.ticks = element_line(size=0.2),
panel.border = element_rect(colour = "black", fill=NA, size=0.2),
legend.position="none",
text = element_text(size = 6.5))
p## Warning: Removed 2 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
ggsave(p,filename = "~/Documents/test.pdf",height = 1.238,width = 1.315)## Warning: Removed 2 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
### 求每个倍性的深度平均值
dfave <- df1 %>% group_by(GenomeType) %>% summarise(mean=mean(MeanDepth))inputDir <- c("data/003_VCF_QC/maf/aabb")
dfoutM <- NULL
fsList <- list.files(inputDir) ### 列出文件名字
length(fsList)## [1] 28
cnt <- 0
dflog <- NULL
for (i in fsList) {
df <- read.delim(file.path(inputDir,i))
chr <- str_sub(i,4,6)
dfout <- df %>% mutate(Total = MafMore005 + MAFLess005) ### data operate
cnt <- cnt+nrow(dfout)
### output file into one
dfoutM <- rbind(dfoutM,dfout)
### log
log <- paste(i, nrow(dfout),sep = "\t")
dflog2 <- data.frame(filename = i, LineA = nrow(df))
dflog <- rbind(dflog,dflog2)
# print(log) ### 打印当前文件的名字和行数
}
# print(cnt) ### 打印总行数
# write.table(dflog,file="~/Documents/log.txt",quote=F,col.name=T,row.names=F,sep = "\t")
dfV1 <- dfoutM %>% summarise(MafMore005 = sum(MafMore005), MafLess005 = sum(MAFLess005)) %>%
pivot_longer(.,cols = c(1:2),names_to = "MafType",values_to = "n") %>%
mutate(GenomeType = "AABB") %>%
mutate(fre = n/sum(n))inputDir <- c("data/003_VCF_QC/maf/aabbdd")
dfoutM <- NULL
fsList <- list.files(inputDir) ### 列出文件名字
length(fsList)## [1] 42
cnt <- 0
dflog <- NULL
for (i in fsList) {
df <- read.delim(file.path(inputDir,i))
chr <- str_sub(i,4,6)
dfout <- df %>% mutate(Total = MafMore005 + MAFLess005) ### data operate
cnt <- cnt+nrow(dfout)
### output file into one
dfoutM <- rbind(dfoutM,dfout)
### log
log <- paste(i, nrow(dfout),sep = "\t")
dflog2 <- data.frame(filename = i, LineA = nrow(df))
dflog <- rbind(dflog,dflog2)
# print(log) ### 打印当前文件的名字和行数
}
# print(cnt) ### 打印总行数
# write.table(dflog,file="~/Documents/log.txt",quote=F,col.name=T,row.names=F,sep = "\t")
dfV2 <- dfoutM %>% summarise(MafMore005 = sum(MafMore005), MafLess005 = sum(MAFLess005)) %>%
pivot_longer(.,cols = c(1:2),names_to = "MafType",values_to = "n") %>%
mutate(GenomeType = "AABBDD") %>%
mutate(fre = n/sum(n))inputDir <- c("data/003_VCF_QC/maf/dd")
dfoutM <- NULL
fsList <- list.files(inputDir) ### 列出文件名字
length(fsList)## [1] 14
cnt <- 0
dflog <- NULL
for (i in fsList) {
df <- read.delim(file.path(inputDir,i))
chr <- str_sub(i,4,6)
dfout <- df %>% mutate(Total = MafMore005 + MAFLess005) ### data operate
cnt <- cnt+nrow(dfout)
### output file into one
dfoutM <- rbind(dfoutM,dfout)
### log
log <- paste(i, nrow(dfout),sep = "\t")
dflog2 <- data.frame(filename = i, LineA = nrow(df))
dflog <- rbind(dflog,dflog2)
# print(log) ### 打印当前文件的名字和行数
}
# print(cnt) ### 打印总行数
# write.table(dflog,file="~/Documents/log.txt",quote=F,col.name=T,row.names=F,sep = "\t")
dfV3 <- dfoutM %>% summarise(MafMore005 = sum(MafMore005), MafLess005 = sum(MAFLess005)) %>%
pivot_longer(.,cols = c(1:2),names_to = "MafType",values_to = "n") %>%
mutate(GenomeType = "DD") %>%
mutate(fre = n/sum(n))df <- rbind(dfV1,dfV2,dfV3)
colB <- c("#ffd702","#fc6e6e","#87cef9")
df$MafType <- factor(df$MafType,levels = c("MafLess005","MafMore005"),labels = c("MAF <= 0.05","MAF > 0.05"))
p <- df %>%
ggplot(aes(x=MafType,y=n,fill=GenomeType))+
geom_bar(stat = 'identity',position = "dodge")+
geom_text(aes(label=paste("\n(",round(fre,2),")",sep = "")),size=3,
position = position_dodge(0.9))+
# geom_text(aes(label=paste(n,"\n(",round(fre,2),")",sep = "")),size=3,nudge_y = 1)+
labs(x="", y="SNP count")+
# scale_fill_brewer(palette = "Set2")+
scale_fill_manual(values = colB)+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.4),
text = element_text(size = 14),legend.position = 'none')+
theme(axis.text.x = element_text(size = 14))
pggsave(p,filename = "~/Documents/test.pdf",height = 2.57,width = 4)library(tidyverse)
library(cowplot)
dataFile <- c("data/003_VCF_QC/AABB_site_QC.txt.gz")
df1 <- read.delim(dataFile)%>% select(GenomeType,Maf)
dataFile <- c("data/003_VCF_QC/AABBDD_site_QC.txt.gz")
df2 <- read.delim(dataFile)%>% select(GenomeType,Maf)
dataFile <- c("data/003_VCF_QC/DD_site_QC.txt.gz")
df3 <- read.delim(dataFile)%>% select(GenomeType,Maf)
df <- rbind(df1,df2,df3)
df <- df %>%
filter(Maf > 0) %>%
# mutate(bin = cut(Maf, breaks=c(0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5))) %>%
mutate(bin = cut(Maf, breaks=c(0,0.1,0.2,0.3,0.4,0.5))) %>%
group_by(GenomeType) %>%
count(bin) %>%
mutate(fre = n/sum(n)) %>%
arrange(GenomeType,bin)
colB <- c("#ffd702","#fc6e6e","#87cef9")
### 直方图形式
p <- df %>%
ggplot(aes(x=bin,y=fre,group=GenomeType,fill=GenomeType))+
geom_bar(stat = 'identity',position = "dodge")+
labs(y="Proportion",x="Minor allele frequency bins")+
# scale_color_brewer(palette = "Set2")+
scale_fill_manual(values = colB)+
scale_y_continuous(limits = c(0,1))+
# scale_x_discrete(labels=c(0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5))+
scale_x_discrete(labels=c(0.1,0.2,0.3,0.4,0.5))+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.4),
text = element_text(size = 12),legend.position = 'none')+
theme(axis.text.x = element_text(size = 10))
# theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
# panel.background = element_blank(),
# axis.line = element_line(size=0.4, colour = "black"),
# text = element_text(size = 12),legend.position = 'none')+
# theme(axis.text.x = element_text(size = 10))
pggsave(p,filename = "~/Documents/test.pdf",height = 2.57,width = 4)
### 线图形式
p <- df %>%
ggplot(aes(x=bin,y=fre,group=GenomeType,color=GenomeType))+
geom_line(aes(linetype=GenomeType))+
scale_linetype_manual(values=c("solid","dotdash","dashed"))+
labs(y="Proportion",x="Minor allele frequency bins")+
# scale_color_brewer(palette = "Set2")+
scale_color_manual(values = colB)+
scale_y_continuous(limits = c(0,1))+
# scale_x_discrete(labels=c(0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5))+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.4),
text = element_text(size = 12),legend.position = 'none')+
theme(axis.text.x = element_text(size = 10))
# theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
# panel.background = element_blank(),
# axis.line = element_line(size=0.4, colour = "black"),
# text = element_text(size = 12),legend.position = 'none')+
# theme(axis.text.x = element_text(size = 10))
pggsave(p,filename = "~/Documents/test2.pdf",height = 2.57,width = 4)library(tidyverse)
library(ggpubr)
dataFile <- c("data/003_VCF_QC/indiviHeterAlongChrom/chr1A1B_Wild_emmer_RH_2000000_1000000.txt")
df <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
head(df)## CHROM BIN_START BIN_END N_VARIANTS HETEROZYGOSITY Taxa
## 1 1B 125000001 127000001 0 NaN PI362699
## 2 1B 319000001 321000001 0 NaN PI362699
## 3 1B 125000001 127000001 0 NaN PI487260
## 4 1B 319000001 321000001 0 NaN PI487260
## 5 1B 125000001 127000001 0 NaN PI94648
## 6 1B 319000001 321000001 0 NaN PI94648
df <- df %>% filter(CHROM == "1A")
## heter order: W11 W20 W19 W14 W15 W13 W16 W18 PI487260 W5 W7 W8 PI362699 PI94648
# df <- subset(df,df$Taxa == "W11" | df$Taxa == "W15" | df$Taxa == "W18"| df$Taxa == "W7")
p1 <- ggplot()+
# stat_smooth(col = colB,method = "gam", formula = y ~ s(x,k=95), size = 1,se=FALSE) +
# geom_line(aes(col=Taxa),size=0.3)+
geom_area(data = df, mapping = aes(x=BIN_START/1000000,y=df$HETEROZYGOSITY,fill=Taxa),alpha=0.7) +
geom_point(aes(x=213,y=0),color="blue")+
xlab("Physical distance(Mb)")+ylab("Residual heterozygosity") +
theme_classic()+
scale_x_continuous(breaks=seq(0, 800, 100), expand = c(0.01, 0))
p1## Warning: Use of `df$HETEROZYGOSITY` is discouraged. Use `HETEROZYGOSITY`
## instead.
ggsave(p1,filename = "~/Documents/test.pdf",height = 4,width = 4)## Warning: Use of `df$HETEROZYGOSITY` is discouraged. Use `HETEROZYGOSITY`
## instead.
chrS <- rep(str_c(rep(1:7,each=3),c("A","B","D")),each=2)
df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/RProject/R4VMap2FigsS1062/data/003_VCF_QC/variantsRate/variantsRate_byChrID.txt") %>%
mutate(Chr=chrS) %>%
group_by(Chr) %>%
summarise(N_BiSNPs=sum(N_BiallelicSNPs),ChrLength=sum(Length)) %>%
mutate(VariantRate=ceiling(ChrLength/N_BiSNPs))## Rows: 42 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (3): ID_Chr_Part, N_BiallelicSNPs, Length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
p <- ggplot(df, aes(x=Chr, y=VariantRate)) +
geom_segment(aes(x=Chr, xend=Chr, y=0, yend=VariantRate),color="grey")+
geom_point( color="orange", size=4) +
xlab("") +
ylab("Variant rate") +
scale_y_continuous(limits = c(0,110), breaks = seq(0,110,25))+
# theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.4),
text = element_text(size = 12),legend.position = 'none')+
theme(axis.text.x = element_text(size = 9))
p# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4.5,width = 6)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4.5,width = 6)
#####
################################# ##
# #
# 求每个亚基因组的均值
# #
################################# ##
out <- df %>%
mutate(Sub = substring(Chr,2,2)) %>%
group_by(Sub) %>%
summarise(sum1=sum(as.numeric(ChrLength)),sum2=sum(as.numeric(N_BiSNPs)), variantRatebySub=ceiling(sum1/sum2))
out## # A tibble: 3 × 4
## Sub sum1 sum2 variantRatebySub
## <chr> <dbl> <dbl> <dbl>
## 1 A 4934891648 97988886 51
## 2 B 5180314468 102836233 51
## 3 D 3951074735 39655547 100
q <- p +
annotate("text",x=-Inf,y= Inf ,hjust=-0.1,vjust=1.5,label=str_c("A sub (Variant rate) = ",out$variantRatebySub[1]," bp"),size=3)+
annotate("text",x=-Inf,y= Inf ,hjust=-0.1,vjust=3,label=str_c("B sub (Variant rate) = ",out$variantRatebySub[2]," bp"),size=3)+
annotate("text",x=-Inf,y= Inf ,hjust=-0.1,vjust=4.5,label=str_c("D sub (Variant rate) = ",out$variantRatebySub[3]," bp"),size=3)
qggsave(q,filename = "~/Documents/test.pdf",height = 3,width = 4)df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/RProject/R4VMap2FigsS1062/data/017_VMap2_SNPcount/001_VMap2_SNP_count.txt") %>%
mutate(Sub=str_sub(Chr,5)) ## Rows: 42 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Chr_Part, Chr
## dbl (11): ID_Chr_Part, Length, Pos_StartIndex(Inclusive), Pos_EndIndex(Exclu...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 求每一列的和
df_sum <- df %>%
group_by(Sub) %>%
summarise(across(.cols = c(TotalVariants:Delection_Num,Gene_Region), .fns = sum))
df_sum## # A tibble: 3 × 7
## Sub TotalVariants BiallelicSNP_Num Indel_Num Insertion_Num Delection_Num
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 86700250 83875573 2824677 883009 1941668
## 2 B 91923417 88945931 2977486 921297 2056189
## 3 D 24275953 23136814 1139139 418358 720781
## # … with 1 more variable: Gene_Region <dbl>
df_sum <- df %>%
summarise(across(.cols = c(TotalVariants:Delection_Num,Gene_Region), .fns = sum))
df_sum## # A tibble: 1 × 6
## TotalVariants BiallelicSNP_Num Indel_Num Insertion_Num Delection_Num
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 202899620 195958318 6941302 2222664 4718638
## # … with 1 more variable: Gene_Region <dbl>
biSNP_Num <- sum(df$BiallelicSNP_Num)
print(str_c("biSNP_Num: ",biSNP_Num))## [1] "biSNP_Num: 195958318"
# library(RIdeogram)
# library(dplyr)
# # wheatGff<-"IWGSC_v1.1_HC_20170706.gff3";
#
# wheatKaryotype<-"data/022_snpDensity/wheatV1.1_karyotype.txt";
# # data(human_karyotype, package="RIdeogram")
# # data(gene_density, package="RIdeogram")
# # data(Random_RNAs_500, package="RIdeogram")
# # head(human_karyotype)
# # head(gene_density)
# # head(Random_RNAs_500)
# # Random_RNAs_500<-Random_RNAs_500
# # wheatGeneDensity<-GFFex(input = wheatGff, karyotype = wheatKaryotype, feature = "gene", window = 1000000)
# vmap2.1SNPDensity<- read.table("data/022_snpDensity/vmap2.1.posAllele_10000000window_1000000step.txt", header = T, stringsAsFactors = F)
#
# # azhurnaya_tissue_breath<-read.table("Azhurnaya_tissue_breadth10MbWindow1MbStep.txt", header = T, stringsAsFactors = F)
# # azhurnaya_tissue_breath<-mutate(azhurnaya_tissue_breath, Color="cccdfe")
# wheatKaryotype<-read.table(wheatKaryotype,header = T,stringsAsFactors = F)
# color<-c("#2787D8","#D2DD14","#F32F0C")
# # color<-c("#16BFFD","#ffcc00","#CB3066")
# # ideogram(karyotype = wheatKaryotype, overlaid = vmap2.1SNPDensity, label = azhurnaya_tissue_breath,label_type = "polygon",Ly = 25, colorset1 = color)
# ideogram(karyotype = wheatKaryotype, overlaid = vmap2.1SNPDensity, colorset1 = color,Lx = 6)
# # convertSVG("chromosome.svg", device = "png",file = "vmap2.1SNPDensity_Azhurnaya_tissue_breadth.png",dpi = 300, width = 6, height = 6)
# convertSVG("chromosome.svg", device = "pdf",file = "vmap2.1SNPDensity111111.pdf",dpi = 300, width = 3.6, height = 3.6*3/4)### 合并文件
# inputDir <- c("data/019_rawVCF_count/001_rawVCF_count")
#
# df <- dir(inputDir,full.names = T) %>%
# map(~read_tsv(.x,show_col_types = FALSE) %>% mutate(Chr=as.numeric(str_sub(basename(.x),4,6)))) %>%
# bind_rows()
#
# write_tsv(df,"data/019_rawVCF_count/002_merge001/001_vmap2.alt.count.txt.gz")
### 打印每个文件的行数,看看具体有多少种情况
df <- read_tsv("data/019_rawVCF_count/002_merge001/001_vmap2.alt.count.txt.gz") %>%
group_by(Chr) %>%
count()## Rows: 2114 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Case
## dbl (2): Count, Chr
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##### conclusion: 46-51 cases
### 统计Alt只含有 A T G C 中的一种,一共有多少SNP
df <- read_tsv("data/019_rawVCF_count/002_merge001/001_vmap2.alt.count.txt.gz") %>%
mutate(CaseGroup = ifelse(Case %in% c("A","T","G","C"),"One","Out")) %>%
filter(CaseGroup=="One") %>%
summarise(sum=sum(Count))## Rows: 2114 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Case
## dbl (2): Count, Chr
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.